////////////////////////////////////////////
////									////
////	STEP 1. Basic variable edits	////
////									////
////////////////////////////////////////////
	
* this file should be uniquely identified at the planID-period level

drop processingdate	// variable for version control only
drop clientid		// single-valued, pretty sure it means us

* format period as a monthly variable
gen m = ym(real(substr(period,1,4)), real(substr(period,-2,2)))
format m %tm
drop period

* destring IDs. For some, attach names as values. Otherwise, drop names
destring orgid, replace
rename orgid planid

foreach var in controller parent pbm formulary {
	qui replace `var'name = trim(upper(`var'name))
	qui destring `var'id, replace
	}

* change some variables to string
foreach var in channel plantype pbmrelationship benefitdesign notlistedpolicy nonformularypolicy {
	gen temp = trim(upper(`var'))
	drop `var'
	encode temp, gen(`var')
	drop temp
	}

////////////////////////////////////////////////////////////////////////////////			
////																		////
////	STEP 2. Edit inconsistencies across different pulls of the data		////
////																		////
////////////////////////////////////////////////////////////////////////////////
	
*** Changes in pull 2

* When there is no PBM the pbmname is marked "NO PBM" instead of being missing, 
* and pbmid is -1 instead of missing
replace pbmname = "NO PBM" if pbmname == ""
replace pbmid = -1 if pbmid == .

* pbmid: ENVISION PHARMACEUTICAL SERVICES has become ELIXIR PBM
replace pbmname = "ELIXIR PBM" if pbmid == 5997

* controllerid: 3 companies switched names
replace controllername = "ELIXIR PBM" if controllerid == 5997
replace controllername = "MOUNTAIN HEALTH CO-OP" if controllerid == 63560
replace controllername = "RIVERSPRING HEALTH PLANS" if controllerid == 65787

* parentid: same 3 companies switched names
replace parentname = "MOUNTAIN HEALTH CO-OP" if parentid == 63560
replace parentname = "ELIXIR INSURANCE" if parentid == 7638
replace parentname = "RIVERSPRING HEALTH PLANS" if parentid == 65787


*** Changes in pull 3

* pbmid: 3 companies had further name changes
replace pbmname = "GLOBAL PHARMACEUTICAL BENEFITS" if pbmid == 5376
replace pbmname = "GAINWELL TECHNOLOGIES" if pbmid == 6189
replace pbmname = "SCRIPIUS" if pbmid == 65099

* controllerid: 5 companies had further name changes
replace controllername = "CIGNA CORPORATION" if controllerid == 3164
replace controllername = "GLOBAL PHARMACEUTICAL BENEFITS" if controllerid == 5376
replace controllername = "HENNEPIN HEALTH" if controllerid == 7444
replace controllername = "BAYLOR SCOTT AND WHITE HEALTH" if controllerid == 7497
replace controllername = "HEALTHYDAKOTA MUTUAL HOLDINGS" if controllerid == 17161

* parentid: 4 companies with further name changes
replace parentname = "CIGNA CORPORATION" if parentid == 3164
replace parentname = "HENNEPIN HEALTH" if parentid == 7444
replace parentname = "BAYLOR SCOTT AND WHITE HEALTH" if parentid == 7497
replace parentname = "HEALTHYDAKOTA MUTUAL HOLDINGS" if parentid == 17161

*** Changes in pull 4 (the last add-on data)
replace pbmname = "SS&C HEALTH" if pbmid == 3995
replace pbmname = "CARELONRX" if pbmid == 69652

replace controllername = "ELEVANCE HEALTH, INC." if controllerid == 3181
replace controllername = "VNS HEALTH" if controllerid == 7665
replace controllername = "MASS GENERAL BRIGHAM HEALTH PLAN" if controllerid == 60545

replace parentname = "ELEVANCE HEALTH, INC." if parentid == 3181
replace parentname = "VNS HEALTH" if parentid == 7665
replace parentname = "MASS GENERAL BRIGHAM HEALTH PLAN" if parentid == 60545

* Multiple formularies and plans changed names, both in pull 2 and 3. We 
* backfill the latest name from pull 3. (Notice that a handful of formularies 
* and plans with different IDs have the same name. We assume these are different 
* formularies and plans).
foreach var in formulary plan {
	bysort `var'id `var'name : gen ind = _n == 1
	bysort `var'id : egen tot`var'Names = total(ind)
	bysort `var'id (m pull) : ///
		replace `var'name = `var'name[_N] if tot`var'Names > 1
	drop ind tot`var'Names
	}

////////////////////////////////////////////////////////////////////////////////
////																		////
////	STEP 3. Downsize the data to be identified by planid and month		////
////																		////
////////////////////////////////////////////////////////////////////////////////
	
* Drop duplicate information across pulls. At this stage, we can check that 
* planid, plantype, and notlistedpolicy uniquely identify observations for each
* pull
forvalues p = 1/3 {
	qui unique planid plantype notlistedpolicy m if pull == `p'
	if r(N) > r(unique) disp "ERROR"
	}

bysort planid plantype notlistedpolicy m (pull) : keep if _n == _N

* for each inconsistency in plantype and notlistedpolicy across pulls we keep 
* the information in the latest pull
bysort planid plantype m (pull) : keep if _n == _N
bysort planid m (pull) : keep if _n == _N

* drop string variables and adopt IDs with labels (can only do after reconciling
* [var]id and [var]name variables) 
foreach var in controller parent pbm formulary {
	labmask `var'id, values(`var'name)
	drop `var'name
	}

drop planname 	// labmask fails on planname because there are too many values,
				// so we just drop it


////////////////////////////////////////////////////////////////////////////////
////																		////
////					STEP 4. Fill in missing years						////
////																		////
////////////////////////////////////////////////////////////////////////////////
xtset planid m
tsfill

foreach var of varlist _all {
	if "`var'" == "planid" | "`var'" == "m" continue
	bysort planid (m) : replace `var' = `var'[_n-1] if `var' == . & `var'[_n-1] != .
	}
	
* compress
compress
